unit FParse;
{ All source code Copyright 1996 Kevin L. Boylan }

{
	TFParse will parse words (as defined by the developer or user) from a file, s string, or
	a PChar.  For performance purposes, all parsing is done in memory.  If you are parsing from
	a file and the file is > 64K, it will be buffered in 64K chunks.  If you are developing in
  the Delphi 2 environment, the file will be buffered in 500,000 byte chunks.

	What is to be parsed is determined by setting one of the following three properties.
	Setting one	of these properties will override any previous settings of one of the three:

	FileToParse:			   The path and filename of a file that you wish to parsed.
	StringToParse:				A String variable that you wish to parse.
	PCharToParse:				A PChar variable that you wish to parse.

	What constitutes a word is defined through these properties:

	NormalCharacters: 		The set of characters that can go into a word.
	SignificantCharacters:	The set of characters that can go into a word only if surrounded
									by NormalCharacters.
	InsignificantCharacters:The set of characters that will be ignored and removed from between
									two NormalCharacters.
	MIN_WORD_LENGTH:			The minimum length of words that will be accepted.
	MAX_WORD_LENGTH:			The maximum length of words returned.  Longer words are truncated.
	CommonWords:				This list of words will be ingored, not returned. If you don't want
									to make use of this property, you can set the property CmnWrdsActive
									to False.

	Once these properties are set, you then repeatedly call GetNext, which returns a string
	containing the next word from the file, string, or PChar.  When GetNext returns a null
	string ('') then all words have been parsed.

	5/12/96			Released version 1.0
	5/15/96			Modified CreateCharSets, changing for loops to while loops because
						the counter was being modified inside the loops with inc(i,2).  This
						problem kept the component from installing under Delphi 2.0  This
						modification resulted in version 1.1.
	6/3/96			Fixed bugs which kept strings and PChars from being parsed.  Also fixed
						a problem where the PercentDone property could be greater then 100% by
                 adding the MinLongInt function.  version 1.2
	7/22/96			Added HTML parsing capabilites (minus special character handling) and condensed
						CreateCharSets member function by consolidating code into an internal procedure;
  7/23/96			Added special character (i.e. &quot;)handling capabilites to HTML parsing.
  8/12/96			Added OnHTMLTag event
	8/13/96			Added AutoDetect capability
  8/16/96			Submitted version 2.0
  8/2196			Fixed bugs dealing with checking for the end of the buffer.  Also added the
  					Lower_Case property.
}

interface

uses
	SysUtils, WinTypes, WinProcs, Messages, Classes, Graphics, Controls,
	Forms, Dialogs;

type
  EDocType = (dtAscii, dtHTML);
  TCharSet = Set of Char;
  EOFileException = class(Exception);
  THTMLTagEvent = procedure(HTMLTag: String) of object;

	TFParse = class(TComponent)
  private
	  { Private declarations }
	  FFileToParse: String;
	  theFile: TFileStream;
	  ByteCount: LongInt;
	  TotalSize: LongInt;
	  FBuffer,CurrChar,EOBuffer: PChar;
	  BuffLen: LongInt;
	  NormChar,SigChar,InSigChar: TCharSet;
	  FNormalCharacters: String;
	  FSignificantCharacters: String;
	  FInsignificantChars: String;
	  FCommonWords: TStrings;
    FMatchPatterns: TStrings;
	  FCmnWrdsActive: Boolean;
	  FMIN_WORD_LENGTH: Integer;
	  FMAX_WORD_LENGTH: Integer;
    FDocumentType: EDocType;
    FOnHTMLTag: THTMLTagEvent;
    FAutoDetect: Boolean;
    FLower_Case: Boolean;
	protected
	  { Protected declarations }
	  procedure SetFileToParse( FName: String );
	  procedure SetStringToParse( theStr: String );
	  procedure SetPCharToParse( thePChar: PChar );
	  function  GetPercentDone: LongInt;
	  function  NextChar: Boolean;
	  procedure Loaded; override;
	  procedure CreateCharSets;
	  procedure SetCommonWords(Value: TStrings);
    procedure SetMatchPatterns( Value: TStrings );
	  procedure Init;
    function	DetermineType: EDocType;
	public
	  { Public declarations }
	  constructor Create( AOwner: TComponent ); override;
	  destructor Destroy; override;

	  property StringToParse: String write SetStringToParse;
	  property PCharToParse: PChar write SetPCharToParse;
	  function GetNext: String;
	  property PercentDone: LongInt read GetPercentDone;
	published
	  { Published properties }
	  property FileToParse: String read FFileToParse write SetFileToParse;
	  property MIN_WORD_LENGTH: Integer read FMIN_WORD_LENGTH write FMIN_WORD_LENGTH default 1;
	  property MAX_WORD_LENGTH: Integer read FMAX_WORD_LENGTH write FMAX_WORD_LENGTH default 20;
	  property NormalCharacters: String read FNormalCharacters write FNormalCharacters;
	  property SignificantCharacters: String read FSignificantCharacters
	  												 						write FSignificantCharacters;
	  property InsignificantChars: String read FInsignificantChars write FInsignificantChars;
	  property CommonWords: TStrings read FCommonWords write SetCommonWords;
    property MatchPatterns: TStrings read FMatchPatterns write SetMatchPatterns;
	  property CmnWrdsActive: Boolean read FCmnWrdsActive write FCmnWrdsActive;
    property DocumentType: EDocType read FDocumentType write FDocumentType default dtAscii;
    property AutoDetect: Boolean read FAutoDetect write FAutoDetect default False;
    property Lower_Case: Boolean read FLower_Case write FLower_Case default True;
    { Published events }
    property OnHTMLTag: THTMLTagEvent read FOnHTMLTag write FOnHTMLTag;
	end;


procedure Register;
function MinLongInt( Long1, Long2: LongInt ): LongInt;

implementation

const
 {$IFDEF WIN32}
	  MAX_WORD = 500000;
 {$ELSE}
    MAX_WORD = 65526;
 {$ENDIF}
 SKIP_CHAR = #1;

constructor TFParse.Create( AOwner: TComponent );
begin
	Inherited Create( AOwner );
	MIN_WORD_LENGTH := 1;					{ Default minimum word length }
	MAX_WORD_LENGTH := 20;      			{ Default maximum word length }
	FCmnWrdsActive := False;     			{ By default, CommonWords is not active }
  FAutoDetect := False;               { By default, AutoDetect is false }
  FLower_Case := True;						{ By default, we lowercase everything }
  FDocumentType := dtAscii;				{ Plain text file is default }
	If (csDesigning In ComponentState) then
	 begin
		NormalCharacters := '0-9A-Za-z'; { Default chars that make words }
		SignificantCharacters := './';   { Default chars that can be inside words only }
		InsignificantChars := ',';       { Default chars that will be stripped from words }
	 end;
	 FCommonWords := TStringList.Create;
   FMatchPatterns := TStringList.Create;
end;

destructor TFParse.Destroy;
begin
	theFile.Free;
	StrDispose( FBuffer );
	FCommonWords.Free;
  FMatchPatterns.Free;
	Inherited Destroy;
end;

procedure TFParse.Loaded;
begin
	Inherited Loaded;
	if not (csDesigning In ComponentState) then
		CreateCharSets;  { Create the Sets of Chars }
end;

procedure TFParse.SetCommonWords(Value: TStrings);
begin
	FCommonWords.Assign(Value);
end;

procedure TFParse.SetMatchPatterns(Value: TStrings);
begin
	FMatchPatterns.Assign(Value);
end;

procedure TFParse.CreateCharSets;
{ Converts from the string representation of the character sets to real
	sets of char }
 procedure CreateSet( var CharSet: TCharSet; SetDef: String );
 var
	i: Integer;
	j: Char;
 begin
  CharSet := [];
  i := 1;
	While i <= Length(SetDef) do  { Read each character }
	 begin
		If (i < Length(SetDef)-1) and (SetDef[i+1] = '-') then
		 begin      { Handle ranges }
			For j := SetDef[i] to SetDef[i+2] do
				CharSet := CharSet + [j];    { Add each in range to set }
			Inc(i,2);
		 end
		else
		 begin
			CharSet := CharSet + [SetDef[i]];    { Add to set }
		 end;
		Inc(i,1);
	 end;
 end;  { procedure CreateSet }

begin
	If NormalCharacters = '' then
		NormalCharacters := '0-9A-Za-z';  					{ If none set, then use the default }
	CreateSet( NormChar, NormalCharacters );				{Create Normal Character Set }
  CreateSet( SigChar, SignificantCharacters ); 		{Create Significant Character Set }
  CreateSet( InSigChar, InsignificantChars );			{Create InSignificant Character Set }
end;

procedure TFParse.Init;
{ Initializes variables prior to any parsing }
begin
	CurrChar := FBuffer-1; { Will start out incrementing CurrChar }
	ByteCount := 0;
	{ EOBuffer := FBuffer + TotalSize }  { Removed 7-27-97 KLB }
	EOBuffer := FBuffer;                 { Added 7-27-97 KLB }
	Inc(EOBuffer,BuffLen-1);             { Added 7-27-97 KLB }
	EOBuffer^ := #0;
end;

function TFParse.DetermineType: EDocType;
{ Determines what type of document (i.e. Ascii or HTML) has been loaded in
	the buffer. }
const
	DetectStringSize = 50;
var
	DetectStr: String[DetectStringSize];
  i: PChar;
begin
	Result := dtAscii;		{ assume ascii }
	i := FBuffer;
  DetectStr := '';
  While ((i<EOBuffer) and (i<FBuffer+DetectStringSize)) do
   begin
    	DetectStr := DetectStr + i^;
     Inc(i);
   end;
  DetectStr := LowerCase(DetectStr);
  { --- Check for HTML }
  If (ExtractFileExt( FFileToParse ) = '.htm' )
  	or ((Pos('<', DetectStr) > 0) and (Pos('html', DetectStr) > 0)) then
  	  	Result := dtHTML;
end;

procedure TFParse.SetFileToParse( FName: String );
{ Opens the file to be parsed.  Maximum block size to be read in at one time is
	MAX_WORD which should be close to 64K for 16 bit and much greater for 32 bit }
begin
	theFile.Free;
	try
		theFile := TFileStream.Create( FName, fmOpenRead );
		If (theFile.Size > MAX_WORD) then
			BuffLen := MAX_WORD
		else
			BuffLen := theFile.Size+1;
		TotalSize := theFile.Size;
		StrDispose(FBuffer);
     FBuffer := nil;
		FBuffer := StrAlloc(BuffLen);
		Init;
		theFile.Read( FBuffer^, BuffLen-1 );
     If AutoDetect then
     	FDocumentType := DetermineType;
	except
		On EFOpenError do
			raise EFOpenError.CreateFmt( 'File %s could not be found', [FName] );
	end;  { except }
end;

procedure TFParse.SetStringToParse( theStr: String );
{ Sets the buffer to be parsed from a String instead of from a file }
begin
	FFileToParse := '';
	TotalSize := Length(theStr);
	BuffLen := TotalSize + 1;
	StrDispose(FBuffer);
  FBuffer := nil;
	FBuffer := StrAlloc( BuffLen );
	StrPCopy( FBuffer, theStr );
	Init;
  If AutoDetect then
    	FDocumentType := DetermineType;
end;

procedure TFParse.SetPCharToParse( thePChar: PChar );
{ Sets the buffer to be parsed from a PChar instead of from a file }
begin
	FFileToParse := '';
	TotalSize := StrLen( thePChar );
	BuffLen := TotalSize + 1;
	StrDispose(FBuffer);
  FBuffer := nil;
	FBuffer := StrAlloc( BuffLen );
	StrCopy( FBuffer, thePChar );
	Init;
  If AutoDetect then
    	FDocumentType := DetermineType;
end;

function TFParse.GetNext: String;
{ returns the next word from the file }
var
  tmpWord: String[255];
  tmpChar: PChar;

  procedure GetHTMLLiteral;
  { It is assumed that tmpChar is pointing at a '&' }
  const
  	LiteralStringSize = 3;
     LiteralEnd: TCharSet = [';',' ','<'];
     NonStandardLiteralEnd: TCharSet = [' ','<'];
  var
  	AStr: String[LiteralStringSize+2];
     tmptmpChar: PChar;
     i: Integer;
  begin
  	AStr := '';
     If tmpChar^ = '#' then { its an ascii representation }
      begin
  	   tmpChar^ := SKIP_CHAR;  { blank the '&' }
        Inc(tmpChar);
      	tmpChar^ := SKIP_CHAR;  { blank the '#' }
        Inc(tmpChar);
      	While (tmpChar < EOBuffer)
              and not(tmpChar^ in LiteralEnd) and (Length(AStr) <= LiteralStringSize) do
         begin
           If tmpChar^ > #31 then
              AStr := AStr + tmpChar^;
           tmpChar^ := SKIP_CHAR;
           Inc(tmpChar);
         end;
        If tmpChar^ in NonStandardLiteralEnd then
           Dec(tmpChar);
        tmpChar^ := Chr( StrToInt(AStr) );
      end
     Else                   { its an entity code-word }
      begin
        { first check to see if there's a ';' close by }
        tmptmpChar := tmpChar;
        Inc(tmptmpChar);
        If tmptmpChar^ = ' ' then
           exit;
        i := 1;
        While (tmptmpChar < EOBuffer) and not(tmptmpChar^ in LiteralEnd) and (i<10) do
         begin
           Inc(tmptmpChar);
           Inc(i);
         end;
        If (i > 10) or not (tmptmpChar^ in LiteralEnd) then
           exit;
  	   tmpChar^ := SKIP_CHAR;   { blank the '&' }
        Inc(tmpChar);
      	AStr := tmpChar^;        { save first char }
        tmpChar^ := SKIP_CHAR;   { and blank it }
        Inc(tmpChar);
        While (tmpChar < EOBuffer) and not (tmpChar^ in LiteralEnd) do
         begin
           If tmpChar^ > #31 then
              AStr := AStr + tmpChar^;
           tmpChar^ := SKIP_CHAR;
           Inc(tmpChar);
         end;
        If tmpChar^ in NonStandardLiteralEnd then
           Dec(tmpChar);
        AStr := LowerCase(AStr);
        If AStr = 'amp' then
        	tmpChar^ := '&'
        Else If AStr = 'gt' then
        	tmpChar^ := '>'
        Else If AStr = 'lt' then
        	tmpChar^ := '<'
        Else If AStr = 'quot' then
        	tmpChar^ := '"'
        Else If AStr = 'nbsp' then
           tmpChar^ := #160
        Else If AStr = 'copy' then
           tmpChar^ := #169
        Else If AStr = 'reg' then
           tmpChar^ := #174
        Else
        	tmpChar^ := ' ';    { we don't handle any others at this time }
      end;
  end;

  procedure ClearHTML;
  { Zero's out irrelevant HTML code so that parsing will ignore }
  var
  	HTMLTag: String;
  begin
  	tmpChar := CurrChar;
     HTMLTag := '';
     If tmpChar^ = '<' then
     Repeat
       	While True do
         begin
       		If (tmpChar^ = '>') then 				{ End on the '>' character }
            begin
            	if Assigned(FOnHTMLTag) then     { Handle the OnHTMLTag event }
               begin
                 HTMLTag := HTMLTag + tmpChar^;
                 FOnHTMLTag( HTMLTag );
                 HTMLTag := '';
               end;
            	tmpChar^ := SKIP_CHAR;                   { Zero out so will be ignored }
              {Inc(tmpChar);}
             	Break;
            end;
           If Assigned(FOnHTMLTag) then         { Collect tag contents if OnHTMLTag is set }
              If tmpChar^ > #31 then
           	   HTMLTag := HTMLTag + tmpChar^;
           tmpChar^ := SKIP_CHAR;
         	Inc(tmpChar);
         end;
     Until (tmpChar^ <> '<')
     Else
       GetHTMLLiteral;                         {It's a literal, not a Tag }
  end;

begin
 try  { EOFileException is raised when end of buffer is reached }
  tmpChar := nil;
	Repeat
		Repeat
			Result := '';
        tmpWord := '';
			While (NextChar) do  { Find next NormChar to start a word }
         begin  { First check to see if it's HTML stuff to skip }
         	{ Begin HTML Check ***** }
         	If ((DocumentType = dtHTML) and (tmpChar < CurrChar)) and ((CurrChar^ = '<') or (CurrChar^ = '&')) then
           	If Length(tmpWord) > 0 then
               begin
               	Dec(CurrChar);
                 Dec(ByteCount);
              	Break;
               end
              Else
           		ClearHTML;
           { End HTML Check ***** }

           If (CurrChar^ = SKIP_CHAR) or (CurrChar^ in InSigChar) then
           	Continue;
				If (CurrChar^ in NormChar) or
            ((CurrChar^ in SigChar) and ((CurrChar+1)^ in NormChar)) then
            begin
            	If Length(tmpWord) < 255 then
						tmpWord := tmpWord + CurrChar^;
            end
           Else
           	Break;
         end;
		Until Length( tmpWord ) >= MIN_WORD_LENGTH;
     If FLower_Case then
     	tmpWord := LowerCase(tmpWord);
		Result := Copy(tmpWord,1,MAX_WORD_LENGTH);
	Until ( (not CmnWrdsActive) or (FCommonWords.IndexOf( Result ) = -1) )

 except
 	on EOFileException do   { End of buffer has been reached }
   begin
   	StrDispose( FBuffer );
		FBuffer := nil;
     theFile.Free;
     theFile := nil;
		exit;
   end;
 end;

end;

function TFParse.NextChar: Boolean;
{ points at the next character in the buffer and reacts according to what's there }
var
	AmountToRead,
	AmountRead:		LongInt;
begin
	Result := True;
	Inc(CurrChar);
	Inc(ByteCount);
	If CurrChar <= EOBuffer then
		exit;
	If ByteCount > TotalSize then
	  	raise EOFileException.Create('End of File');
	CurrChar := FBuffer;
	AmountToRead := BuffLen - 1;
 	FBuffer[0] := #0;
	AmountRead := theFile.Read( (FBuffer+StrLen(FBuffer))^, AmountToRead );
	If AmountRead < AmountToRead then
	 begin
		EOBuffer := CurrChar;
		Inc(EOBuffer, AmountRead);
		EOBuffer^ := #0;
	 end;
end;

{ Modified this method 7-27-97 KLB }
function TFParse.GetPercentDone: LongInt;
var
  R,S: Extended;
begin
	If (TotalSize > 0) and (ByteCount > 0) then
	 begin
		S := ByteCount;
     S := S * 100;
		R := S/TotalSize;
   end
  else
     R := 0;
	Result := MinLongInt(Round(R),100);
end;

function MinLongInt( Long1, Long2: LongInt ): LongInt;
{ returns the smallest of two long integers }
begin
	If Long1 < Long2 then
		result := Long1
	Else
		result := Long2;
end;

procedure Register;
{ Registers the component }
begin
	RegisterComponents('Samples', [TFParse]);
end;

end.
